Prepration¶

In [2]:
import pandas as pd
import numpy as np
from helper import *
import seaborn as sns
import matplotlib.pyplot as plt
In [ ]:
current_month = pd.Timestamp.now().month
current_year = pd.Timestamp.now().year

cpu_data = get_cpu_table(current_month, current_year)
gpu_data = get_gpu_table(current_month, current_year)

full_relation = get_full_relation(current_month, current_year)
Connection to PostgreSQL DB successful
Connection to PostgreSQL DB successful
Connection to PostgreSQL DB successful

Preview the data

In [4]:
print(f"CPU Data: {cpu_data.shape[0]} rows, {cpu_data.shape[1]} columns")
print(f"GPU Data: {gpu_data.shape[0]} rows, {gpu_data.shape[1]} columns")
print(f"Full Relation Data: {full_relation.shape[0]} rows, {full_relation.shape[1]} columns")
CPU Data: 2346 rows, 28 columns
GPU Data: 618 rows, 13 columns
Full Relation Data: 2702 rows, 70 columns

Data Analasys¶

CPU Dataframe¶

Preview the data¶

Dataframe head¶

In [5]:
# Display the first few rows
print(cpu_data.head())
                     name performance_clockspeed performance_turbospeed  \
0      amd ryzen 9 7940hx                   2.40                   5.20   
1      amd ryzen 9 7945hx                   2.50                   5.40   
2    apple m3 max 14 core                   4.00                   None   
3  via eden (duplicate 1)                   1.20                   None   
4    omap4 espresso board                   1.00                   None   

   performance_cores  performance_threads efficient_clockspeed  \
0               16.0                 32.0                 None   
1               16.0                 32.0                 None   
2               10.0                 10.0                 None   
3                1.0                  1.0                 None   
4                2.0                  2.0                 None   

  efficient_turbospeed  efficient_cores  efficient_threads    tdp  ...  \
0                 None              NaN                NaN  55.00  ...   
1                 None              NaN                NaN  55.00  ...   
2                 None              4.0                4.0  78.00  ...   
3                 None              NaN                NaN   7.00  ...   
4                 None              NaN                NaN   None  ...   

   eff_l2_cache  integer_math floating_point_math find_prime_numbers  \
0          None      207233.0            123369.0              277.0   
1          None      209905.0            125019.0              286.0   
2          None       78626.0            111437.0              518.0   
3          None           NaN                 NaN                NaN   
4          None         549.0               245.0                NaN   

  random_string_sorting data_encryption data_compression physics  \
0               79994.0         43598.0         711597.0  2339.0   
1               79831.0         43118.0         712469.0  2468.0   
2               53210.0         21365.0         461605.0  4276.0   
3                   NaN             NaN              NaN     NaN   
4                 429.0             NaN           3490.0    16.0   

  extended_instructions  single_thread  
0               51729.0           3983  
1               52293.0           4051  
2               21631.0           4774  
3                   NaN             97  
4                  26.0            222  

[5 rows x 28 columns]

Dataframe tail¶

In [6]:
# Display the first few rows
print(cpu_data.tail())
                       name performance_clockspeed performance_turbospeed  \
2341  intel core i9 13900hx                   2.20                   5.40   
2342  intel core i9 14900hx                   2.20                   5.80   
2343     amd ryzen 9 7845hx                   3.00                   5.20   
2344  intel core i9 13980hx                   2.20                   5.60   
2345   amd ryzen 9 7945hx3d                   2.30                   5.40   

      performance_cores  performance_threads efficient_clockspeed  \
2341                8.0                 16.0                 1.60   
2342                8.0                 16.0                 1.60   
2343               12.0                 24.0                 None   
2344                8.0                 16.0                 None   
2345               16.0                 32.0                 None   

     efficient_turbospeed  efficient_cores  efficient_threads    tdp  ...  \
2341                 3.90             16.0               16.0  55.00  ...   
2342                 4.10             16.0               16.0  55.00  ...   
2343                 None              NaN                NaN  55.00  ...   
2344                 4.00             16.0               16.0  55.00  ...   
2345                 None              NaN                NaN  55.00  ...   

      eff_l2_cache  integer_math floating_point_math find_prime_numbers  \
2341   4 x 4096 kb      159985.0            112944.0              195.0   
2342   4 x 4096 kb      163612.0            116486.0              201.0   
2343          None      158561.0             96412.0              330.0   
2344   4 x 4096 kb      168919.0            120442.0              193.0   
2345          None      202909.0            124090.0              450.0   

     random_string_sorting data_encryption data_compression physics  \
2341               60702.0         33080.0         545227.0  2718.0   
2342               62740.0         34092.0         564409.0  2769.0   
2343               62882.0         33507.0         556104.0  2343.0   
2344               67235.0         36066.0         599834.0  2603.0   
2345               82524.0         42822.0         705368.0  4242.0   

     extended_instructions  single_thread  
2341               31300.0           4123  
2342               32655.0           4300  
2343               41953.0           3966  
2344               34546.0           4291  
2345               53381.0           4111  

[5 rows x 28 columns]

Check all the features¶

In [7]:
print(cpu_data.columns)
Index(['name', 'performance_clockspeed', 'performance_turbospeed',
       'performance_cores', 'performance_threads', 'efficient_clockspeed',
       'efficient_turbospeed', 'efficient_cores', 'efficient_threads', 'tdp',
       'multithread_rating', 'single_thread_rating', 'l1_instruction_cache',
       'l1_data_cache', 'l2_cache', 'l3_cache', 'eff_l1_instruction_cache',
       'eff_l1_data_cache', 'eff_l2_cache', 'integer_math',
       'floating_point_math', 'find_prime_numbers', 'random_string_sorting',
       'data_encryption', 'data_compression', 'physics',
       'extended_instructions', 'single_thread'],
      dtype='object')

Check the data types and non-null counts¶

In [8]:
print(cpu_data.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2346 entries, 0 to 2345
Data columns (total 28 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   name                      2346 non-null   object 
 1   performance_clockspeed    2336 non-null   object 
 2   performance_turbospeed    925 non-null    object 
 3   performance_cores         2257 non-null   float64
 4   performance_threads       2257 non-null   float64
 5   efficient_clockspeed      128 non-null    object 
 6   efficient_turbospeed      114 non-null    object 
 7   efficient_cores           162 non-null    float64
 8   efficient_threads         162 non-null    float64
 9   tdp                       1440 non-null   object 
 10  multithread_rating        2346 non-null   int64  
 11  single_thread_rating      2346 non-null   int64  
 12  l1_instruction_cache      1408 non-null   object 
 13  l1_data_cache             1406 non-null   object 
 14  l2_cache                  1404 non-null   object 
 15  l3_cache                  867 non-null    object 
 16  eff_l1_instruction_cache  102 non-null    object 
 17  eff_l1_data_cache         102 non-null    object 
 18  eff_l2_cache              92 non-null     object 
 19  integer_math              2146 non-null   float64
 20  floating_point_math       2146 non-null   float64
 21  find_prime_numbers        2010 non-null   float64
 22  random_string_sorting     2146 non-null   float64
 23  data_encryption           1154 non-null   float64
 24  data_compression          2146 non-null   float64
 25  physics                   2146 non-null   float64
 26  extended_instructions     2146 non-null   float64
 27  single_thread             2346 non-null   int64  
dtypes: float64(12), int64(3), object(13)
memory usage: 513.3+ KB
None

Look at descriptive statistics¶

In [9]:
print(cpu_data.describe())
       performance_cores  performance_threads  efficient_cores  \
count        2257.000000          2257.000000       162.000000   
mean            4.546743             5.949047         6.808642   
std             2.802396             4.000950         2.879644   
min             1.000000             1.000000         2.000000   
25%             2.000000             4.000000         4.000000   
50%             4.000000             4.000000         8.000000   
75%             8.000000             8.000000         8.000000   
max            32.000000            32.000000        16.000000   

       efficient_threads  multithread_rating  single_thread_rating  \
count         162.000000         2346.000000           2346.000000   
mean            6.975309         5052.590793           1393.645354   
std             3.082611         7324.056972           1014.524862   
min             2.000000           93.000000             95.000000   
25%             4.000000          842.250000            572.500000   
50%             8.000000         2171.500000           1087.500000   
75%             8.000000         5708.000000           1945.000000   
max            16.000000        58008.000000           4785.000000   

        integer_math  floating_point_math  find_prime_numbers  \
count    2146.000000          2146.000000         2010.000000   
mean    21717.505126         11974.847623           24.503483   
std     25396.102263         18184.162714           48.416459   
min       122.000000           166.000000            1.000000   
25%      5158.500000          1985.500000            5.000000   
50%     13529.500000          4760.500000           10.000000   
75%     25373.750000         12913.000000           23.000000   
max    209905.000000        131765.000000          616.000000   

       random_string_sorting  data_encryption  data_compression      physics  \
count            2146.000000      1154.000000       2146.000000  2146.000000   
mean             9684.270270      5993.851820      73231.819199   367.676608   
std             10208.248553      6325.100871      89901.011732   518.734837   
min               294.000000      1027.000000       2023.000000    14.000000   
25%              2917.000000      1860.250000      18286.500000    93.000000   
50%              5860.000000      3257.500000      38382.500000   184.000000   
75%             12584.750000      7589.500000      90601.250000   416.000000   
max             82524.000000     43598.000000     712469.000000  6478.000000   

       extended_instructions  single_thread  
count            2146.000000    2346.000000  
mean             3771.722274    1393.645354  
std              6012.164275    1014.524862  
min                25.000000      95.000000  
25%               539.000000     572.500000  
50%              1352.500000    1087.500000  
75%              3508.250000    1945.000000  
max             53381.000000    4785.000000  

Feature Analysis¶

Overall Performance Ratings¶

Features:

  • multithread_rating, single_thread_rating
Distribution of ratings¶
In [10]:
# Set the plot style
sns.set_theme(style="whitegrid")

# Create subplots
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Plot single_thread_rating distribution
sns.histplot(cpu_data['single_thread_rating'], ax=axes[0], color='blue', kde=True)
axes[0].set_title("Single Thread Rating Distribution")
axes[0].set_xlabel('Single Thread Rating')
axes[0].set_ylabel('Frequency')

# Plot multithread_rating distribution
sns.histplot(cpu_data['multithread_rating'], ax=axes[1], color='green', kde=True)
axes[1].set_title("Multithread Rating Distribution")
axes[1].set_xlabel('Multithread Rating')
axes[1].set_ylabel('Frequency')

# Adjust layout
plt.tight_layout()

# Display the plot
plt.show()
No description has been provided for this image
In [11]:
# Generate statistics for single_thread_rating
single_thread_stats = cpu_data['single_thread_rating'].describe()
print("Single Thread Rating Statistics:")
print(single_thread_stats)

# Generate statistics for multithread_rating
multithread_stats = cpu_data['multithread_rating'].describe()
print("\nMultithread Rating Statistics:")
print(multithread_stats)
Single Thread Rating Statistics:
count    2346.000000
mean     1393.645354
std      1014.524862
min        95.000000
25%       572.500000
50%      1087.500000
75%      1945.000000
max      4785.000000
Name: single_thread_rating, dtype: float64

Multithread Rating Statistics:
count     2346.000000
mean      5052.590793
std       7324.056972
min         93.000000
25%        842.250000
50%       2171.500000
75%       5708.000000
max      58008.000000
Name: multithread_rating, dtype: float64
Single vs Multithreaded¶
In [12]:
# Set the plot style
sns.set_theme(style="whitegrid")

# Create the scatter plot
plt.figure(figsize=(10, 6))
sns.scatterplot(data=cpu_data, x='single_thread_rating', y='multithread_rating', alpha=0.7)

# Add titles and labels
plt.title("Single Thread Rating vs Multithread Rating", fontsize=16)
plt.xlabel("Single Thread Rating", fontsize=14)
plt.ylabel("Multithread Rating", fontsize=14)
plt.grid(True)

# Show the plot
plt.show()

# Calculate and print the correlation
correlation = cpu_data['single_thread_rating'].corr(cpu_data['multithread_rating'])
print(f"The correlation between single_thread_rating and multithread_rating is: {correlation:.2f}")
No description has been provided for this image
The correlation between single_thread_rating and multithread_rating is: 0.88

Clockspeed metrics¶

Features:

  • performance_clockspeed, performance_turbospeed
  • efficient_clockspeed, efficient_turbospeed
Distribution¶
In [13]:
# Set the plot style
sns.set_theme(style="whitegrid")

# Create subplots
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Plot performance clockspeed
sns.kdeplot(cpu_data['performance_clockspeed'].dropna(), ax=axes[0, 0], color='blue', fill=True)
axes[0, 0].set_title("Performance Cores' Clockspeed Distribution")
axes[0, 0].set_xlabel('Clockspeed (GHz)')
axes[0, 0].set_ylabel('Density')

# Plot performance turbospeed
sns.kdeplot(cpu_data['performance_turbospeed'].dropna(), ax=axes[0, 1], color='green', fill=True)
axes[0, 1].set_title("Performance Cores' Turbospeed Distribution")
axes[0, 1].set_xlabel('Turbospeed (GHz)')
axes[0, 1].set_ylabel('Density')

# Plot efficient clockspeed
sns.kdeplot(cpu_data['efficient_clockspeed'].dropna(), ax=axes[1, 0], color='red', fill=True)
axes[1, 0].set_title("Efficient Cores' Clockspeed Distribution")
axes[1, 0].set_xlabel('Clockspeed (GHz)')
axes[1, 0].set_ylabel('Density')

# Plot efficient turbospeed
sns.kdeplot(cpu_data['efficient_turbospeed'].dropna(), ax=axes[1, 1], color='purple', fill=True)
axes[1, 1].set_title("Efficient Cores' Turbospeed Distribution")
axes[1, 1].set_xlabel('Turbospeed (GHz)')
axes[1, 1].set_ylabel('Density')

# Determine common x and y limits for all plots
x_min = min(
    cpu_data['performance_clockspeed'].min(),
    cpu_data['performance_turbospeed'].min(),
    cpu_data['efficient_clockspeed'].min(),
    cpu_data['efficient_turbospeed'].min(),
)

x_max = max(
    cpu_data['performance_clockspeed'].max(),
    cpu_data['performance_turbospeed'].max(),
    cpu_data['efficient_clockspeed'].max(),
    cpu_data['efficient_turbospeed'].max(),
)

y_max = max(ax.get_ylim()[1] for ax in axes.flat)  # Find the maximum y limit among all plots

# Set common limits
for ax in axes.flat:
    ax.set_xlim(x_min, x_max)
    ax.set_ylim(0, y_max)

# Adjust layout
plt.tight_layout()

# Display the plot
plt.show()
No description has been provided for this image
Correlation with Performance¶
In [14]:
# Calculate correlations
correlation_performance_single = cpu_data['performance_clockspeed'].corr(cpu_data['single_thread_rating'])
correlation_performance_multi = cpu_data['performance_clockspeed'].corr(cpu_data['multithread_rating'])
correlation_efficient_single = cpu_data['efficient_clockspeed'].corr(cpu_data['single_thread_rating'])
correlation_efficient_multi = cpu_data['efficient_clockspeed'].corr(cpu_data['multithread_rating'])

# Print the results
print(f"Correlation between performance_clockspeed and single_thread_rating: {correlation_performance_single:.2f}")
print(f"Correlation between performance_clockspeed and multithread_rating: {correlation_performance_multi:.2f}")
print(f"Correlation between efficient_clockspeed and single_thread_rating: {correlation_efficient_single:.2f}")
print(f"Correlation between efficient_clockspeed and multithread_rating: {correlation_efficient_multi:.2f}")
Correlation between performance_clockspeed and single_thread_rating: 0.60
Correlation between performance_clockspeed and multithread_rating: 0.48
Correlation between efficient_clockspeed and single_thread_rating: 0.20
Correlation between efficient_clockspeed and multithread_rating: 0.14
In [15]:
# Clone the cpu_data DataFrame
cpu_data_clone = cpu_data.copy()

# Ensure columns are numeric
cpu_data_clone['performance_clockspeed'] = pd.to_numeric(cpu_data_clone['performance_clockspeed'], errors='coerce')
cpu_data_clone['efficient_clockspeed'] = pd.to_numeric(cpu_data_clone['efficient_clockspeed'], errors='coerce')
cpu_data_clone['single_thread_rating'] = pd.to_numeric(cpu_data_clone['single_thread_rating'], errors='coerce')
cpu_data_clone['multithread_rating'] = pd.to_numeric(cpu_data_clone['multithread_rating'], errors='coerce')

# Create subplots
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Plot performance_clockspeed vs single_thread_rating with regression line
sns.regplot(data=cpu_data_clone, x='performance_clockspeed', y='single_thread_rating', ax=axes[0, 0], color='blue', scatter_kws={'s': 10})
axes[0, 0].set_title('Performance Clockspeed vs Single Thread Rating')
axes[0, 0].set_xlabel('Performance Clockspeed (GHz)')
axes[0, 0].set_ylabel('Single Thread Rating')

# Plot performance_clockspeed vs multithread_rating with regression line
sns.regplot(data=cpu_data_clone, x='performance_clockspeed', y='multithread_rating', ax=axes[0, 1], color='green', scatter_kws={'s': 10})
axes[0, 1].set_title('Performance Clockspeed vs Multithread Rating')
axes[0, 1].set_xlabel('Performance Clockspeed (GHz)')
axes[0, 1].set_ylabel('Multithread Rating')

# Plot efficient_clockspeed vs single_thread_rating with regression line
sns.regplot(data=cpu_data_clone, x='efficient_clockspeed', y='single_thread_rating', ax=axes[1, 0], color='red', scatter_kws={'s': 10})
axes[1, 0].set_title('Efficient Clockspeed vs Single Thread Rating')
axes[1, 0].set_xlabel('Efficient Clockspeed (GHz)')
axes[1, 0].set_ylabel('Single Thread Rating')

# Plot efficient_clockspeed vs multithread_rating with regression line
sns.regplot(data=cpu_data_clone, x='efficient_clockspeed', y='multithread_rating', ax=axes[1, 1], color='purple', scatter_kws={'s': 10})
axes[1, 1].set_title('Efficient Clockspeed vs Multithread Rating')
axes[1, 1].set_xlabel('Efficient Clockspeed (GHz)')
axes[1, 1].set_ylabel('Multithread Rating')

plt.tight_layout()
plt.show()
No description has been provided for this image
Boost impact¶
In [16]:
# Clone the cpu_data DataFrame
cpu_data_clone = cpu_data.copy()

# Convert columns to numeric, forcing errors to NaN
cpu_data_clone['performance_turbospeed'] = pd.to_numeric(cpu_data_clone['performance_turbospeed'], errors='coerce')
cpu_data_clone['performance_clockspeed'] = pd.to_numeric(cpu_data_clone['performance_clockspeed'], errors='coerce')
cpu_data_clone['efficient_turbospeed'] = pd.to_numeric(cpu_data_clone['efficient_turbospeed'], errors='coerce')
cpu_data_clone['efficient_clockspeed'] = pd.to_numeric(cpu_data_clone['efficient_clockspeed'], errors='coerce')

# Compute turbo boost margins
cpu_data_clone['performance_turbo_boost'] = cpu_data_clone['performance_turbospeed'] - cpu_data_clone['performance_clockspeed']
cpu_data_clone['efficient_turbo_boost'] = cpu_data_clone['efficient_turbospeed'] - cpu_data_clone['efficient_clockspeed']

# Analyze turbo boost impact on single_thread_rating and multithread_rating
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Performance turbo boost vs single_thread_rating
sns.regplot(data=cpu_data_clone, x='performance_turbo_boost', y='single_thread_rating', ax=axes[0, 0], color='blue')
axes[0, 0].set_title('Performance Turbo Boost vs Single Thread Rating')
axes[0, 0].set_xlabel('Performance Turbo Boost (GHz)')
axes[0, 0].set_ylabel('Single Thread Rating')

# Performance turbo boost vs multithread_rating
sns.regplot(data=cpu_data_clone, x='performance_turbo_boost', y='multithread_rating', ax=axes[0, 1], color='green')
axes[0, 1].set_title('Performance Turbo Boost vs Multithread Rating')
axes[0, 1].set_xlabel('Performance Turbo Boost (GHz)')
axes[0, 1].set_ylabel('Multithread Rating')

# Efficient turbo boost vs single_thread_rating
sns.regplot(data=cpu_data_clone, x='efficient_turbo_boost', y='single_thread_rating', ax=axes[1, 0], color='red')
axes[1, 0].set_title('Efficient Turbo Boost vs Single Thread Rating')
axes[1, 0].set_xlabel('Efficient Turbo Boost (GHz)')
axes[1, 0].set_ylabel('Single Thread Rating')

# Efficient turbo boost vs multithread_rating
sns.regplot(data=cpu_data_clone, x='efficient_turbo_boost', y='multithread_rating', ax=axes[1, 1], color='purple')
axes[1, 1].set_title('Efficient Turbo Boost vs Multithread Rating')
axes[1, 1].set_xlabel('Efficient Turbo Boost (GHz)')
axes[1, 1].set_ylabel('Multithread Rating')

plt.tight_layout()
plt.show()
No description has been provided for this image

Core & Thread Analysis¶

Features:

  • performance_cores, performance_threads
  • efficient_cores, efficient_threads
Distribution¶
In [17]:
# Set the plot style
sns.set_theme(style="whitegrid")

# Create subplots
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Plot performance cores
sns.histplot(cpu_data['performance_cores'].dropna(), ax=axes[0, 0], color='blue', kde=True)
axes[0, 0].set_title("Performance Cores Distribution")
axes[0, 0].set_xlabel('Number of Cores')
axes[0, 0].set_ylabel('Frequency')

# Plot performance threads
sns.histplot(cpu_data['performance_threads'].dropna(), ax=axes[0, 1], color='green', kde=True)
axes[0, 1].set_title("Performance Threads Distribution")
axes[0, 1].set_xlabel('Number of Threads')
axes[0, 1].set_ylabel('Frequency')

# Plot efficient cores
sns.histplot(cpu_data['efficient_cores'].dropna(), ax=axes[1, 0], color='red', kde=True)
axes[1, 0].set_title("Efficient Cores Distribution")
axes[1, 0].set_xlabel('Number of Cores')
axes[1, 0].set_ylabel('Frequency')

# Plot efficient threads
sns.histplot(cpu_data['efficient_threads'].dropna(), ax=axes[1, 1], color='purple', kde=True)
axes[1, 1].set_title("Efficient Threads Distribution")
axes[1, 1].set_xlabel('Number of Threads')
axes[1, 1].set_ylabel('Frequency')

# Determine common x and y limits for all plots
x_min = min(
    cpu_data['performance_cores'].min(),
    cpu_data['performance_threads'].min(),
    cpu_data['efficient_cores'].min(),
    cpu_data['efficient_threads'].min(),
)

x_max = max(
    cpu_data['performance_cores'].max(),
    cpu_data['performance_threads'].max(),
    cpu_data['efficient_cores'].max(),
    cpu_data['efficient_threads'].max(),
)

y_max = max(ax.get_ylim()[1] for ax in axes.flat)  # Get the maximum y-limit among all plots

# Set common x and y limits for all subplots
for ax in axes.flat:
    ax.set_xlim(x_min, x_max)
    ax.set_ylim(0, y_max)

# Adjust layout
plt.tight_layout()

# Display the plot
plt.show()
No description has been provided for this image
In [18]:
# Clone the cpu_data DataFrame
cpu_data_clone = cpu_data.copy()

# Calculate core/thread ratio for performance and efficient cores
cpu_data_clone['performance_core_thread_ratio'] = cpu_data_clone['performance_cores'] / cpu_data_clone['performance_threads']
cpu_data_clone['efficient_core_thread_ratio'] = cpu_data_clone['efficient_cores'] / cpu_data_clone['efficient_threads']

# Calculate frequency counts for each ratio
performance_ratio_counts = cpu_data_clone['performance_core_thread_ratio'].value_counts().sort_index()
efficient_ratio_counts = cpu_data_clone['efficient_core_thread_ratio'].value_counts().sort_index()

# Print the frequency of core/thread ratios
print("Performance Core/Thread Ratio Frequencies:")
print(performance_ratio_counts)

print("\nEfficient Core/Thread Ratio Frequencies:")
print(efficient_ratio_counts)
Performance Core/Thread Ratio Frequencies:
performance_core_thread_ratio
0.5     773
1.0    1484
Name: count, dtype: int64

Efficient Core/Thread Ratio Frequencies:
efficient_core_thread_ratio
0.5      4
1.0    158
Name: count, dtype: int64
Multi-threading impact¶
In [19]:
# Calculate correlations
correlation_performance_cores = cpu_data['performance_cores'].corr(cpu_data['multithread_rating'])
correlation_performance_threads = cpu_data['performance_threads'].corr(cpu_data['multithread_rating'])
correlation_efficient_cores = cpu_data['efficient_cores'].corr(cpu_data['multithread_rating'])
correlation_efficient_threads = cpu_data['efficient_threads'].corr(cpu_data['multithread_rating'])

# Print the results
print(f"Correlation between performance_cores and multithread_rating: {correlation_performance_cores:.2f}")
print(f"Correlation between performance_threads and multithread_rating: {correlation_performance_threads:.2f}")
print(f"Correlation between efficient_cores and multithread_rating: {correlation_efficient_cores:.2f}")
print(f"Correlation between efficient_threads and multithread_rating: {correlation_efficient_threads:.2f}")
Correlation between performance_cores and multithread_rating: 0.41
Correlation between performance_threads and multithread_rating: 0.74
Correlation between efficient_cores and multithread_rating: 0.47
Correlation between efficient_threads and multithread_rating: 0.50
In [20]:
# Create subplots
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Plot performance_cores vs multithread_rating with regression line
sns.regplot(data=cpu_data, x='performance_cores', y='multithread_rating', ax=axes[0, 0], color='blue', scatter_kws={'s': 10})
axes[0, 0].set_title('Performance Cores vs Multithread Rating')
axes[0, 0].set_xlabel('Performance Cores')
axes[0, 0].set_ylabel('Multithread Rating')

# Plot performance_threads vs multithread_rating with regression line
sns.regplot(data=cpu_data, x='performance_threads', y='multithread_rating', ax=axes[0, 1], color='green', scatter_kws={'s': 10})
axes[0, 1].set_title('Performance Threads vs Multithread Rating')
axes[0, 1].set_xlabel('Performance Threads')
axes[0, 1].set_ylabel('Multithread Rating')

# Plot efficient_cores vs multithread_rating with regression line
sns.regplot(data=cpu_data, x='efficient_cores', y='multithread_rating', ax=axes[1, 0], color='red', scatter_kws={'s': 10})
axes[1, 0].set_title('Efficient Cores vs Multithread Rating')
axes[1, 0].set_xlabel('Efficient Cores')
axes[1, 0].set_ylabel('Multithread Rating')

# Plot efficient_threads vs multithread_rating with regression line
sns.regplot(data=cpu_data, x='efficient_threads', y='multithread_rating', ax=axes[1, 1], color='purple', scatter_kws={'s': 10})
axes[1, 1].set_title('Efficient Threads vs Multithread Rating')
axes[1, 1].set_xlabel('Efficient Threads')
axes[1, 1].set_ylabel('Multithread Rating')

plt.tight_layout()
plt.show()
No description has been provided for this image

Power Consumption (TDP)¶

Features:

  • TDP
TDP vs Performance¶
In [21]:
# Clone the cpu_data DataFrame
cpu_data_clone = cpu_data.copy()

# Ensure 'tdp' column is numeric
cpu_data_clone['tdp'] = pd.to_numeric(cpu_data_clone['tdp'], errors='coerce')

# Calculate correlations
correlation_tdp_single = cpu_data_clone['tdp'].corr(cpu_data_clone['single_thread_rating'])
correlation_tdp_multi = cpu_data_clone['tdp'].corr(cpu_data_clone['multithread_rating'])

# Print the results
print(f"Correlation between TDP and single_thread_rating: {correlation_tdp_single:.2f}")
print(f"Correlation between TDP and multithread_rating: {correlation_tdp_multi:.2f}")
Correlation between TDP and single_thread_rating: 0.39
Correlation between TDP and multithread_rating: 0.43
In [22]:
# Create subplots
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Plot TDP vs single_thread_rating with regression line
sns.regplot(data=cpu_data_clone, x='tdp', y='single_thread_rating', ax=axes[0], color='blue', scatter_kws={'s': 10})
axes[0].set_title('TDP vs Single Thread Rating')
axes[0].set_xlabel('TDP (W)')
axes[0].set_ylabel('Single Thread Rating')

# Plot TDP vs multithread_rating with regression line
sns.regplot(data=cpu_data_clone, x='tdp', y='multithread_rating', ax=axes[1], color='green', scatter_kws={'s': 10})
axes[1].set_title('TDP vs Multithread Rating')
axes[1].set_xlabel('TDP (W)')
axes[1].set_ylabel('Multithread Rating')

plt.tight_layout()
plt.show()
No description has been provided for this image
Efficiency Analysis¶
In [23]:
# Clone the cpu_data DataFrame
cpu_data_clone = cpu_data.copy()

# Ensure 'tdp' column is numeric
cpu_data_clone['tdp'] = pd.to_numeric(cpu_data_clone['tdp'], errors='coerce')

# Calculate performance efficiency
cpu_data_clone['performance_efficiency'] = cpu_data_clone['multithread_rating'] / cpu_data_clone['tdp']

# Drop rows with NaN values in 'performance_efficiency'
cpu_data_clone = cpu_data_clone.dropna(subset=['performance_efficiency'])

# Filter out rows where 'performance_efficiency' is less than or equal to 0
cpu_data_clone = cpu_data_clone[cpu_data_clone['performance_efficiency'] > 0]

# Sort the DataFrame by 'performance_efficiency'
cpu_data_clone = cpu_data_clone.sort_values(by='performance_efficiency', ascending=False)

# Display the top 5 rows of the updated DataFrame
print("Top 5 rows:")
print(cpu_data_clone[['name', 'multithread_rating', 'tdp', 'performance_efficiency']].head())

# Display the bottom 5 rows of the updated DataFrame
print("\nBottom 5 rows:")
print(cpu_data_clone[['name', 'multithread_rating', 'tdp', 'performance_efficiency']].tail())

# Plot the distribution of performance efficiency
plt.figure(figsize=(10, 6))
sns.histplot(cpu_data_clone['performance_efficiency'], kde=True, color="blue", bins=30)
plt.title("Performance Efficiency Distribution", fontsize=16)
plt.xlabel("Performance Efficiency (multithread_rating / tdp)", fontsize=14)
plt.ylabel("Frequency", fontsize=14)
plt.grid(True)
plt.show()
Top 5 rows:
                         name  multithread_rating   tdp  \
2113  intel core ultra 7 164u               15106   9.0   
2081            apple a18 pro               13020   8.0   
2090      intel core i7 1260u               14001   9.0   
2052      intel core i7 1250u               11654   9.0   
2331    amd ryzen ai 9 hx 370               35501  28.0   

      performance_efficiency  
2113             1678.444444  
2081             1627.500000  
2090             1555.666667  
2052             1294.888889  
2331             1267.892857  

Bottom 5 rows:
                             name  multithread_rating   tdp  \
70   mobile amd athlon xp-m 1800+                 193  45.0   
187    mobile amd athlon 64 3400+                 333  81.5   
181    mobile amd athlon 64 3200+                 326  81.5   
6              intel celeron b710                 106  35.0   
16   mobile intel celeron 1.80ghz                 121  66.1   

     performance_efficiency  
70                 4.288889  
187                4.085890  
181                4.000000  
6                  3.028571  
16                 1.830560  
No description has been provided for this image

GPU Dataframe¶

Preview the data¶

Dataframe head¶

In [24]:
# Display the first few rows
print(gpu_data.head())
                      name  avg_g3d_mark bus_interface  max_memory_size  \
0  rtx 2000 ada generation         15223          None              NaN   
1                rtx a4000         15394  pcie 4.0 x16          16384.0   
2          radeon rx 6800s         15436   pcie 4.0 x8           8192.0   
3         geforce rtx 3070         15439  pcie 4.0 x16           8192.0   
4        geforce gtx 675mx          2717  pcie 3.0 x16           4096.0   

   core_clock max_direct open_gl  max_tdp  test_directx_9  test_directx_10  \
0         NaN       None    None      NaN           195.0             90.0   
1       735.0       12_2     4.6    140.0           160.0            112.0   
2      1800.0         12     4.6    100.0           188.0             90.0   
3      1100.0         12     4.6    115.0           159.0            115.0   
4       667.0         11     4.5    100.0            59.0             13.0   

   test_directx_11  test_directx_12  test_gpu_compute  
0            130.0             66.0            5572.0  
1            133.0             66.0            6757.0  
2            133.0             64.0            6131.0  
3            137.0             65.0            6957.0  
4             21.0              9.0            1158.0  

Dataframe tail¶

In [25]:
# Display the last few rows
print(gpu_data.tail())
                        name  avg_g3d_mark bus_interface  max_memory_size  \
613          radeon rx 7900m         22752          None              NaN   
614  rtx 4000 ada generation         22879          None              NaN   
615  rtx 5000 ada generation         24197          None              NaN   
616         geforce rtx 4080         25099  pcie 4.0 x16          12288.0   
617         geforce rtx 4090         27668  pcie 4.0 x16          16384.0   

     core_clock max_direct open_gl  max_tdp  test_directx_9  test_directx_10  \
613         NaN       None    None      NaN           267.0            127.0   
614         NaN       None    None      NaN           270.0            139.0   
615         NaN       None    None      NaN           274.0            154.0   
616      1860.0       12_2     4.6    150.0           286.0            161.0   
617      1455.0       12_2     4.6    150.0           314.0            181.0   

     test_directx_11  test_directx_12  test_gpu_compute  
613            256.0             93.0            9297.0  
614            223.0            101.0            9224.0  
615            241.0            102.0            9666.0  
616            248.0             96.0           11446.0  
617            270.0            106.0           12653.0  

Check all the features¶

In [26]:
print(gpu_data.columns)
Index(['name', 'avg_g3d_mark', 'bus_interface', 'max_memory_size',
       'core_clock', 'max_direct', 'open_gl', 'max_tdp', 'test_directx_9',
       'test_directx_10', 'test_directx_11', 'test_directx_12',
       'test_gpu_compute'],
      dtype='object')

Check the data types and non-null counts¶

In [27]:
print(gpu_data.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 618 entries, 0 to 617
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              618 non-null    object 
 1   avg_g3d_mark      618 non-null    int64  
 2   bus_interface     349 non-null    object 
 3   max_memory_size   342 non-null    float64
 4   core_clock        309 non-null    float64
 5   max_direct        353 non-null    object 
 6   open_gl           346 non-null    object 
 7   max_tdp           245 non-null    float64
 8   test_directx_9    340 non-null    float64
 9   test_directx_10   340 non-null    float64
 10  test_directx_11   340 non-null    float64
 11  test_directx_12   340 non-null    float64
 12  test_gpu_compute  340 non-null    float64
dtypes: float64(8), int64(1), object(4)
memory usage: 62.9+ KB
None

Look at descriptive statistics¶

In [28]:
print(gpu_data.describe())
       avg_g3d_mark  max_memory_size   core_clock     max_tdp  test_directx_9  \
count    618.000000       342.000000   309.000000  245.000000      340.000000   
mean    2794.695793      2852.590643   756.132686   58.142857       65.000000   
std     4641.351905      3298.820120   366.364012   38.361524       67.759944   
min        2.000000         2.000000   143.000000    7.000000        1.000000   
25%      358.000000       512.000000   500.000000   25.000000       11.000000   
50%      671.500000      2048.000000   660.000000   50.000000       36.000000   
75%     2697.000000      4096.000000   954.000000   80.000000      108.250000   
max    27668.000000     16384.000000  2321.000000  165.000000      314.000000   

       test_directx_10  test_directx_11  test_directx_12  test_gpu_compute  
count       340.000000       340.000000       340.000000        340.000000  
mean         26.820588        38.591176        19.605882       1900.920588  
std          37.466998        51.947789        24.964588       2308.994291  
min           0.000000         0.000000         0.000000          0.000000  
25%           2.000000         4.000000         0.000000        239.500000  
50%           7.000000        15.000000         7.500000        806.000000  
75%          35.000000        54.750000        31.000000       2872.500000  
max         181.000000       270.000000       106.000000      12653.000000  

Feature Analysis¶

Clock Speed Analysis¶

Features:

  • core_clock
Distribution¶
In [29]:
# Plot the distribution of core_clock
plt.figure(figsize=(10, 6))
sns.histplot(gpu_data['core_clock'].dropna(), kde=True, color='blue', bins=30)

# Add labels and title
plt.title("Distribution of GPU Core Clock Speeds", fontsize=16)
plt.xlabel("Core Clock (MHz)", fontsize=14)
plt.ylabel("Frequency", fontsize=14)

# Show the plot
plt.show()
No description has been provided for this image
Impact on Performance¶
In [30]:
# Calculate correlation coefficients
correlation_core_clock_avg_g3d_mark = gpu_data['core_clock'].corr(gpu_data['avg_g3d_mark'])
correlation_core_clock_test_directx_9 = gpu_data['core_clock'].corr(gpu_data['test_directx_9'])
correlation_core_clock_test_directx_10 = gpu_data['core_clock'].corr(gpu_data['test_directx_10'])
correlation_core_clock_test_directx_11 = gpu_data['core_clock'].corr(gpu_data['test_directx_11'])
correlation_core_clock_test_directx_12 = gpu_data['core_clock'].corr(gpu_data['test_directx_12'])
correlation_core_clock_test_gpu_compute = gpu_data['core_clock'].corr(gpu_data['test_gpu_compute'])

# Print correlation coefficients
print(f"Correlation between core_clock and avg_g3d_mark: {correlation_core_clock_avg_g3d_mark:.2f}")
print(f"Correlation between core_clock and test_directx_9: {correlation_core_clock_test_directx_9:.2f}")
print(f"Correlation between core_clock and test_directx_10: {correlation_core_clock_test_directx_10:.2f}")
print(f"Correlation between core_clock and test_directx_11: {correlation_core_clock_test_directx_11:.2f}")
print(f"Correlation between core_clock and test_directx_12: {correlation_core_clock_test_directx_12:.2f}")
print(f"Correlation between core_clock and test_gpu_compute: {correlation_core_clock_test_gpu_compute:.2f}")

# Set the plot style
sns.set_theme(style="whitegrid")

# Create subplots
fig, axes = plt.subplots(3, 2, figsize=(14, 18))

# Plot core_clock vs avg_g3d_mark
sns.regplot(data=gpu_data, x='core_clock', y='avg_g3d_mark', ax=axes[0, 0], color='blue', scatter_kws={'s': 10})
axes[0, 0].set_title(f"Core Clock vs Avg G3D Mark (Correlation: {correlation_core_clock_avg_g3d_mark:.2f})")

# Plot core_clock vs test_directx_9
sns.regplot(data=gpu_data, x='core_clock', y='test_directx_9', ax=axes[0, 1], color='green', scatter_kws={'s': 10})
axes[0, 1].set_title(f"Core Clock vs Test DirectX 9 (Correlation: {correlation_core_clock_test_directx_9:.2f})")

# Plot core_clock vs test_directx_10
sns.regplot(data=gpu_data, x='core_clock', y='test_directx_10', ax=axes[1, 0], color='red', scatter_kws={'s': 10})
axes[1, 0].set_title(f"Core Clock vs Test DirectX 10 (Correlation: {correlation_core_clock_test_directx_10:.2f})")

# Plot core_clock vs test_directx_11
sns.regplot(data=gpu_data, x='core_clock', y='test_directx_11', ax=axes[1, 1], color='purple', scatter_kws={'s': 10})
axes[1, 1].set_title(f"Core Clock vs Test DirectX 11 (Correlation: {correlation_core_clock_test_directx_11:.2f})")

# Plot core_clock vs test_directx_12
sns.regplot(data=gpu_data, x='core_clock', y='test_directx_12', ax=axes[2, 0], color='orange', scatter_kws={'s': 10})
axes[2, 0].set_title(f"Core Clock vs Test DirectX 12 (Correlation: {correlation_core_clock_test_directx_12:.2f})")

# Plot core_clock vs test_gpu_compute
sns.regplot(data=gpu_data, x='core_clock', y='test_gpu_compute', ax=axes[2, 1], color='brown', scatter_kws={'s': 10})
axes[2, 1].set_title(f"Core Clock vs Test GPU Compute (Correlation: {correlation_core_clock_test_gpu_compute:.2f})")

# Adjust layout
plt.tight_layout()

# Display the plot
plt.show()
Correlation between core_clock and avg_g3d_mark: 0.71
Correlation between core_clock and test_directx_9: 0.70
Correlation between core_clock and test_directx_10: 0.63
Correlation between core_clock and test_directx_11: 0.68
Correlation between core_clock and test_directx_12: 0.69
Correlation between core_clock and test_gpu_compute: 0.68
No description has been provided for this image

Memory and Bandwidth Analysis¶

Features:

  • max_memory_size
  • bus_interface
Memory Size¶
In [31]:
# Clone the gpu_data DataFrame
gpu_data_clone = gpu_data.copy()

# Define the memory size categories with handling for NaN values
def categorize_memory_size(memory_size):
    if pd.isna(memory_size):  # Check if the value is NaN
        return 'Unknown'
    elif memory_size <= 2048:
        return '<2GB'
    elif 2048 < memory_size <= 4096:
        return '2–4GB'
    elif 4096 < memory_size <= 8192:
        return '4–8GB'
    elif 8192 < memory_size <= 16384:
        return '8–16GB'
    else:
        return '>16GB'

# Apply the categorization function to the 'max_memory_size' column
gpu_data_clone['memory_size_category'] = gpu_data_clone['max_memory_size'].apply(categorize_memory_size)

# Group by the memory size category and calculate the average avg_g3d_mark
memory_size_comparison = gpu_data_clone.groupby('memory_size_category')['avg_g3d_mark'].mean()

# Exclude the 'Unknown' category from the comparison
memory_size_comparison = memory_size_comparison[memory_size_comparison.index != 'Unknown']

# Check the unique categories in the memory_size_comparison DataFrame
print("Unique categories in memory_size_comparison:", memory_size_comparison.index)

# Define the custom order of memory size categories
category_order = ['<2GB', '2–4GB', '4–8GB', '8–16GB', '>16GB']

# Ensure that the order only includes categories that are present in the data
category_order = [category for category in category_order if category in memory_size_comparison.index]

# Sort the memory_size_comparison based on the custom order
memory_size_comparison = memory_size_comparison[category_order]

# Print the results
print(memory_size_comparison)

# Plot the comparison
plt.figure(figsize=(10, 6))
memory_size_comparison.plot(kind='bar', color='skyblue', edgecolor='black')
plt.title("Average G3D Mark by GPU Memory Size Category", fontsize=16)
plt.xlabel("Memory Size Category", fontsize=14)
plt.ylabel("Average G3D Mark", fontsize=14)
plt.xticks(rotation=0)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()

# Show the plot
plt.show()
Unique categories in memory_size_comparison: Index(['2–4GB', '4–8GB', '8–16GB', '<2GB'], dtype='object', name='memory_size_category')
memory_size_category
<2GB        579.054545
2–4GB      3843.720588
4–8GB     11465.523810
8–16GB    16140.750000
Name: avg_g3d_mark, dtype: float64
No description has been provided for this image
Bus Interface¶
In [32]:
# Clone the gpu_data DataFrame
gpu_data_clone = gpu_data.copy()

# Filter out rows with missing bus_interface or avg_g3d_mark
filtered_gpu_data_clone = gpu_data_clone.dropna(subset=['bus_interface', 'avg_g3d_mark'])

# Group by bus_interface and calculate the average avg_g3d_mark
bus_interface_performance = filtered_gpu_data_clone.groupby('bus_interface')['avg_g3d_mark'].mean().sort_values()

# Plot the results
plt.figure(figsize=(12, 6))
sns.barplot(y=bus_interface_performance.index, x=bus_interface_performance.values, palette="viridis", orient='h')
plt.title("Impact of Bus Interface on GPU Performance (avg_g3d_mark)", fontsize=16)
plt.xlabel("Average G3D Mark", fontsize=14)
plt.ylabel("Bus Interface", fontsize=14)
plt.grid(axis='x', linestyle='--', alpha=0.7)
plt.tight_layout()

# Show the plot
plt.show()
/tmp/ipykernel_538/3052634162.py:12: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(y=bus_interface_performance.index, x=bus_interface_performance.values, palette="viridis", orient='h')
No description has been provided for this image

Power Consumption (TDP)¶

Features:

  • max_tdp
Performance vs Power¶
In [33]:
# Clone the gpu_data DataFrame
gpu_data_clone = gpu_data.copy()

# Ensure 'max_tdp' column is numeric
gpu_data_clone['max_tdp'] = pd.to_numeric(gpu_data_clone['max_tdp'], errors='coerce')

# Calculate correlation
correlation_tdp_g3d = gpu_data_clone['max_tdp'].corr(gpu_data_clone['avg_g3d_mark'])

# Print the correlation result
print(f"Correlation between max_tdp and avg_g3d_mark: {correlation_tdp_g3d:.2f}")

# Plot the relationship
plt.figure(figsize=(10, 6))
sns.regplot(data=gpu_data_clone, x='max_tdp', y='avg_g3d_mark', color='blue', scatter_kws={'s': 10})

# Add titles and labels
plt.title("Max TDP vs Avg G3D Mark", fontsize=16)
plt.xlabel("Max TDP (W)", fontsize=14)
plt.ylabel("Avg G3D Mark", fontsize=14)
plt.grid(True)

# Show the plot
plt.show()
Correlation between max_tdp and avg_g3d_mark: 0.75
No description has been provided for this image
Efficiency¶
In [34]:
# Clone the gpu_data DataFrame
gpu_data_clone = gpu_data.copy()

# Ensure 'avg_g3d_mark' and 'max_tdp' columns are numeric
gpu_data_clone['avg_g3d_mark'] = pd.to_numeric(gpu_data_clone['avg_g3d_mark'], errors='coerce')
gpu_data_clone['max_tdp'] = pd.to_numeric(gpu_data_clone['max_tdp'], errors='coerce')

# Exclude rows where 'max_tdp' is NaN
gpu_data_clone = gpu_data_clone.dropna(subset=['max_tdp'])

# Compute performance efficiency
gpu_data_clone['efficiency'] = gpu_data_clone['avg_g3d_mark'] / gpu_data_clone['max_tdp']

# Sort the DataFrame by 'efficiency'
gpu_data_sorted = gpu_data_clone.sort_values(by='efficiency', ascending=False)

# Display the top 5 rows of the sorted DataFrame
print("Top 5 GPUs by Efficiency:")
print(gpu_data_sorted[['name', 'avg_g3d_mark', 'max_tdp', 'efficiency']].head())

# Display the bottom 5 rows of the sorted DataFrame
print("\nBottom 5 GPUs by Efficiency:")
print(gpu_data_sorted[['name', 'avg_g3d_mark', 'max_tdp', 'efficiency']].tail())
Top 5 GPUs by Efficiency:
                 name  avg_g3d_mark  max_tdp  efficiency
509  radeon pro w6300          5560     25.0  222.400000
595   radeon rx 7600s         14679     75.0  195.720000
597   radeon rx 6700s         14969     80.0  187.112500
555  radeon pro 5600m          9233     50.0  184.660000
617  geforce rtx 4090         27668    150.0  184.453333

Bottom 5 GPUs by Efficiency:
                    name  avg_g3d_mark  max_tdp  efficiency
103       radeon hd 6320           147     45.0    3.266667
167  geforce go 7800 gtx           210     65.0    3.230769
84        radeon hd 6310           122     45.0    2.711111
63        radeon hd 6250            94     35.0    2.685714
70        radeon hd 6290           105     45.0    2.333333

Overall Performance Ratings¶

Features:

  • avg_g3d_mark (3DMark score)
  • test_gpu_compute (compute performance)
Distribution of ratings¶
In [35]:
# Plot the distribution of avg_g3d_mark
plt.figure(figsize=(12, 6))
sns.histplot(gpu_data['avg_g3d_mark'].dropna(), kde=True, color='blue', bins=30)
plt.title("Distribution of Avg G3D Mark", fontsize=16)
plt.xlabel("Avg G3D Mark", fontsize=14)
plt.ylabel("Frequency", fontsize=14)
plt.grid(True)
plt.show()

# Plot the distribution of test_gpu_compute
plt.figure(figsize=(12, 6))
sns.histplot(gpu_data['test_gpu_compute'].dropna(), kde=True, color='green', bins=30)
plt.title("Distribution of Test GPU Compute", fontsize=16)
plt.xlabel("Test GPU Compute", fontsize=14)
plt.ylabel("Frequency", fontsize=14)
plt.grid(True)
plt.show()
No description has been provided for this image
No description has been provided for this image
Compute vs Gaming¶
In [36]:
# Clone the gpu_data DataFrame
gpu_data_clone = gpu_data.copy()

# Create performance categories based on avg_g3d_mark
bins = [0, 2000, 4000, 6000, 8000, 10000]
labels = ['Very Low', 'Low', 'Medium', 'High', 'Very High']
gpu_data_clone['performance_category'] = pd.cut(gpu_data_clone['avg_g3d_mark'], bins=bins, labels=labels)

# Calculate correlation
correlation_gaming_compute = gpu_data_clone['avg_g3d_mark'].corr(gpu_data_clone['test_gpu_compute'])

# Plot
plt.figure(figsize=(10, 6))
sns.scatterplot(data=gpu_data_clone, x='avg_g3d_mark', y='test_gpu_compute', hue='performance_category', alpha=0.7)

# Add titles and labels
plt.title(f"Avg G3D Mark vs Test GPU Compute (Correlation: {correlation_gaming_compute:.2f})", fontsize=16)
plt.xlabel("Avg G3D Mark (Gaming Performance)", fontsize=14)
plt.ylabel("Test GPU Compute (Compute Performance)", fontsize=14)
plt.grid(True)

# Show plot
plt.show()

# Print correlation
print(f"The correlation between avg_g3d_mark and test_gpu_compute is: {correlation_gaming_compute:.2f}")
No description has been provided for this image
The correlation between avg_g3d_mark and test_gpu_compute is: 0.99

Full Laptop Dataframe¶

Source (Laptop Shop)¶

Analyzing number of laptops from each source¶

In [37]:
# Get the unique values and their counts
source_counts = full_relation['laptop_specs_source'].value_counts()

# Plot the unique values and their counts
plt.figure(figsize=(10, 6))
ax = sns.barplot(y=source_counts.index, x=source_counts.values, palette="viridis")
plt.title("Number of laptops per shop", fontsize=16)
plt.xlabel("Count", fontsize=14)
plt.ylabel("Source", fontsize=14)
plt.xticks(rotation=45)
plt.grid(axis='x', linestyle='--', alpha=0.7)
plt.tight_layout()

# Add data labels
for container in ax.containers:
    ax.bar_label(container, fmt='%d')

# Show the plot
plt.show()
/tmp/ipykernel_538/3058434414.py:6: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  ax = sns.barplot(y=source_counts.index, x=source_counts.values, palette="viridis")
No description has been provided for this image

Analysising price grouped by source¶

In [38]:
# Set the plot style
sns.set_theme(style="whitegrid")

# Create a boxplot for price distribution by brand/source
plt.figure(figsize=(14, 8))
sns.boxplot(data=full_relation, y='laptop_specs_source', x='laptop_specs_price', palette="viridis")

# Add titles and labels
plt.title("Price Distribution by Source", fontsize=16)
plt.ylabel("Source", fontsize=14)
plt.xlabel("Price", fontsize=14)
plt.grid(True)

# Show the plot
plt.tight_layout()
plt.show()
/tmp/ipykernel_538/3580318046.py:6: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(data=full_relation, y='laptop_specs_source', x='laptop_specs_price', palette="viridis")
No description has been provided for this image
In [39]:
# Group by 'laptop_specs_source' and calculate descriptive statistics for 'laptop_specs_price'
price_stats_by_source = full_relation.groupby('laptop_specs_source')['laptop_specs_price'].describe()

# Print the statistics
print(price_stats_by_source)
                     count          mean           std         min  \
laptop_specs_source                                                  
cellphones           264.0  2.994519e+07  2.073170e+07   9490000.0   
fptshop              232.0  2.868957e+07  1.949101e+07   3590000.0   
gearvn               184.0  3.044543e+07  1.562514e+07   9490000.0   
hacom                489.0  2.441693e+07  1.206659e+07   8799000.0   
laptopaz             265.0  2.375830e+07  9.536130e+06  11990000.0   
laptopworld          287.0  4.186979e+07  2.161909e+07  14990000.0   
nguyenkim             91.0  2.023209e+07  9.465540e+06   8900000.0   
phongvu              649.0  3.362795e+07  2.288596e+07   9690000.0   
thegioididong        241.0  2.250369e+07  1.038162e+07   7990000.0   

                            25%         50%         75%          max  
laptop_specs_source                                                   
cellphones           16990000.0  23990000.0  35115000.0  182490000.0  
fptshop              16990000.0  22740000.0  31690000.0  128990000.0  
gearvn               20365000.0  25990000.0  36240000.0  117990000.0  
hacom                16499000.0  21199000.0  29499000.0   95699000.0  
laptopaz             17590000.0  22790000.0  26890000.0   66990000.0  
laptopworld          26990000.0  35990000.0  49390000.0  134190000.0  
nguyenkim            15990000.0  18990000.0  22240000.0   80990000.0  
phongvu              18990000.0  25990000.0  39990000.0  182390000.0  
thegioididong        16490000.0  19990000.0  26890000.0   96990000.0  

Brand¶

Analysing number of laptops from each brand¶

In [40]:
# Get the unique values and their counts
brand_counts = full_relation['laptop_specs_brand'].value_counts()

# Plot the unique values and their counts
plt.figure(figsize=(12, 8))
ax = sns.barplot(y=brand_counts.index, x=brand_counts.values, palette="viridis")
plt.title("Number of laptops per brand", fontsize=16)
plt.xlabel("Count", fontsize=14)
plt.ylabel("Brand", fontsize=14)
plt.xticks(rotation=45)
plt.grid(axis='x', linestyle='--', alpha=0.7)
plt.tight_layout()

# Add data labels
for container in ax.containers:
    ax.bar_label(container, fmt='%d')

# Show the plot
plt.show()
/tmp/ipykernel_538/2418909123.py:6: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  ax = sns.barplot(y=brand_counts.index, x=brand_counts.values, palette="viridis")
No description has been provided for this image

Analysising price grouped by brand¶

In [41]:
# Set the plot style
sns.set_theme(style="whitegrid")

# Create a boxplot for price distribution by brand
plt.figure(figsize=(14, 8))
sns.boxplot(data=full_relation, y='laptop_specs_brand', x='laptop_specs_price', palette="viridis")

# Add titles and labels
plt.title("Price Distribution by Brand", fontsize=16)
plt.ylabel("Brand", fontsize=14)
plt.xlabel("Price", fontsize=14)

plt.grid(True)

# Show the plot
plt.tight_layout()
plt.show()
/tmp/ipykernel_538/2449895171.py:6: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(data=full_relation, y='laptop_specs_brand', x='laptop_specs_price', palette="viridis")
No description has been provided for this image

Central Processing Unit (CPU)¶

Basic analysis¶

In [42]:
# Group by 'laptop_specs_cpu' and calculate the mean price and count
mean_price_by_cpu = full_relation.groupby('laptop_specs_cpu')['laptop_specs_price'].agg(['mean', 'count'])
print("Number of unique CPUs:", mean_price_by_cpu.shape[0], end='\n\n')

# Sort the DataFrame by mean price
mean_price_by_cpu = mean_price_by_cpu.sort_values(by='mean', ascending=False)

# Format the mean price as currency
mean_price_by_cpu['mean'] = mean_price_by_cpu['mean'].apply(lambda x: f"{x:,.2f}đ")

# Display the results
print("Top 10 CPUs by Mean Price:")
print(mean_price_by_cpu.head(10), '\n\n')

print("Bottom 10 CPUs by Mean Price:")
print(mean_price_by_cpu.tail(10), '\n\n')


# Sort the DataFrame by count
mean_price_by_cpu = mean_price_by_cpu.sort_values(by='count', ascending=False)

# Display the results
print("Top 10 CPUs by Count:")
print(mean_price_by_cpu.head(10), '\n\n')

print("Bottom 10 CPUs by Count:")
print(mean_price_by_cpu.tail(10), '\n\n')
Number of unique CPUs: 129

Top 10 CPUs by Mean Price:
                                  mean  count
laptop_specs_cpu                             
apple m3 max 16 core   128,140,000.00đ      6
apple m4 max 16 core   102,240,000.00đ      4
apple m2 max 12 core   101,756,666.67đ     15
apple m4 max 14 core    84,590,000.00đ      8
apple m3 max 14 core    84,440,000.00đ     10
intel core i9 13980hx   81,961,428.57đ      7
intel core i7 13850hx   75,399,500.00đ      4
intel core i9 13950hx   73,969,750.00đ      4
intel core i7 1365u     69,699,000.00đ      1
intel core i9 14900hx   69,551,428.57đ     70 


Bottom 10 CPUs by Mean Price:
                               mean  count
laptop_specs_cpu                          
amd ryzen 7 5825u    12,990,000.00đ      1
intel core i3 1305u  12,586,714.29đ     14
intel core i3 1315u  12,453,285.71đ     28
amd ryzen 7 5700u    12,230,538.46đ     13
amd ryzen 5 7520u    11,985,444.44đ     18
intel core i3 8145u  11,640,000.00đ      2
intel core i3 1215u  10,592,369.57đ     46
intel core i3 1220p  10,490,000.00đ      1
intel celeron n4500   8,490,000.00đ      2
intel celeron n4120   3,590,000.00đ      1 


Top 10 CPUs by Count:
                                   mean  count
laptop_specs_cpu                              
intel core i5 1335u      17,875,032.47đ    154
intel core ultra 7 155h  36,644,916.67đ    144
intel core i5 1235u      16,143,857.14đ    126
intel core i7 1355u      23,234,104.84đ    124
apple m2 8 core          34,058,632.08đ    106
intel core i7 13620h     27,316,057.14đ    105
intel core i5 12450h     17,762,419.35đ     93
intel core i5 12500h     21,572,036.14đ     83
apple m3 8 core          37,496,413.33đ     75
intel core i5 13420h     19,471,930.56đ     72 


Bottom 10 CPUs by Count:
                                mean  count
laptop_specs_cpu                           
intel core i5 1345u   24,999,000.00đ      1
amd ryzen 9 5900hx    18,390,000.00đ      1
intel core i5 10300h  14,990,000.00đ      1
intel core i7 1160g7  17,590,000.00đ      1
intel core i3 8130u   13,690,000.00đ      1
amd ryzen 5 2500u     14,139,000.00đ      1
amd ryzen 7 5825u     12,990,000.00đ      1
intel core i5 11300h  13,690,000.00đ      1
intel core i3 1220p   10,490,000.00đ      1
intel celeron n4120    3,590,000.00đ      1 


Analyzing CPU performance relation with price¶

In [43]:
# Calculate correlations
correlation_multithread_price = full_relation['cpu_specs_multithread_rating'].corr(full_relation['laptop_specs_price'])
correlation_single_thread_price = full_relation['cpu_specs_single_thread_rating'].corr(full_relation['laptop_specs_price'])

# Print the results
print(f"Correlation between multithread_rating and price: {correlation_multithread_price:.2f}")
print(f"Correlation between single_thread_rating and price: {correlation_single_thread_price:.2f}")
Correlation between multithread_rating and price: 0.64
Correlation between single_thread_rating and price: 0.59
In [44]:
# Set the plot style
sns.set_theme(style="whitegrid")

# Create subplots
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Plot single_thread_rating vs price with regression line
sns.regplot(data=full_relation, x='cpu_specs_single_thread_rating', y='laptop_specs_price', ax=axes[0], color='blue', scatter_kws={'s': 10})
axes[0].set_title('Single Thread Rating vs Price')
axes[0].set_xlabel('Single Thread Rating')
axes[0].set_ylabel('Price')

# Plot multithread_rating vs price with regression line
sns.regplot(data=full_relation, x='cpu_specs_multithread_rating', y='laptop_specs_price', ax=axes[1], color='green', scatter_kws={'s': 10})
axes[1].set_title('Multithread Rating vs Price')
axes[1].set_xlabel('Multithread Rating')
axes[1].set_ylabel('Price')

plt.tight_layout()
plt.show()
No description has been provided for this image

Graphics Processing Unit (GPU)¶

Basic analysis¶

In [45]:
# Group by 'laptop_specs_gpu' and calculate the mean price and count
mean_price_by_gpu = full_relation.groupby('laptop_specs_vga')['laptop_specs_price'].agg(['mean', 'count'])
print("Number of unique GPUs:", mean_price_by_gpu.shape[0], end='\n\n')

# Sort the DataFrame by mean price
mean_price_by_gpu = mean_price_by_gpu.sort_values(by='mean', ascending=False)

# Format the mean price as currency
mean_price_by_gpu['mean'] = mean_price_by_gpu['mean'].apply(lambda x: f"{x:,.2f}đ")

# Display the results
print("Top 10 GPUs by Mean Price:")
print(mean_price_by_gpu.head(10), '\n\n')

print("Bottom 10 GPUs by Mean Price:")
print(mean_price_by_gpu.tail(10), '\n\n')


# Sort the DataFrame by count
mean_price_by_gpu = mean_price_by_gpu.sort_values(by='count', ascending=False)

# Display the results
print("Top 10 GPUs by Count:")
print(mean_price_by_gpu.head(10), '\n\n')

print("Bottom 10 GPUs by Count:")
print(mean_price_by_gpu.tail(10), '\n\n')
Number of unique GPUs: 21

Top 10 GPUs by Mean Price:
                                    mean  count
laptop_specs_vga                               
geforce rtx 4090         105,975,714.29đ     21
geforce rtx 4080          84,420,000.00đ     30
rtx 2000 ada generation   81,022,916.67đ     12
rtx a1000                 66,099,500.00đ      4
geforce rtx 4070          52,101,764.71đ     85
rtx a500                  49,051,888.89đ      9
rtx 500 ada generation    48,240,000.00đ      2
geforce rtx 3070 ti       39,840,000.00đ      4
geforce rtx 4060          35,624,334.62đ    260
geforce rtx 3060          28,675,714.29đ     28 


Bottom 10 GPUs by Mean Price:
                               mean  count
laptop_specs_vga                          
geforce rtx 3070     28,290,000.00đ      6
geforce rtx 3050 ti  27,331,935.48đ     31
geforce mx570        26,062,666.67đ      3
radeon rx 7600s      23,323,333.33đ      3
geforce rtx 3050     22,468,951.61đ    186
geforce mx450        20,942,250.00đ      4
geforce mx550        20,144,529.41đ     17
geforce rtx 2050     18,800,204.92đ    122
geforce gtx 1650     17,940,000.00đ      8
radeon rx 6550m      15,740,000.00đ      4 


Top 10 GPUs by Count:
                                mean  count
laptop_specs_vga                           
geforce rtx 4060      35,624,334.62đ    260
geforce rtx 4050      28,432,912.84đ    218
geforce rtx 3050      22,468,951.61đ    186
geforce rtx 2050      18,800,204.92đ    122
geforce rtx 4070      52,101,764.71đ     85
geforce rtx 3050 ti   27,331,935.48đ     31
geforce rtx 4080      84,420,000.00đ     30
geforce rtx 3060      28,675,714.29đ     28
geforce rtx 4090     105,975,714.29đ     21
geforce mx550         20,144,529.41đ     17 


Bottom 10 GPUs by Count:
                                  mean  count
laptop_specs_vga                             
rtx a500                49,051,888.89đ      9
geforce gtx 1650        17,940,000.00đ      8
geforce rtx 3070        28,290,000.00đ      6
rtx a1000               66,099,500.00đ      4
geforce mx450           20,942,250.00đ      4
geforce rtx 3070 ti     39,840,000.00đ      4
radeon rx 6550m         15,740,000.00đ      4
geforce mx570           26,062,666.67đ      3
radeon rx 7600s         23,323,333.33đ      3
rtx 500 ada generation  48,240,000.00đ      2 


Analyzing GPU performance relation with price¶

In [46]:
# Calculate the correlation between avg_g3d_mark and price
correlation_avg_g3d_mark_price = full_relation['gpu_specs_avg_g3d_mark'].corr(full_relation['laptop_specs_price'])

# Print the correlation result
print(f"Correlation between avg_g3d_mark and price: {correlation_avg_g3d_mark_price:.2f}")
Correlation between avg_g3d_mark and price: 0.70
In [47]:
# Set the plot style
sns.set_theme(style="whitegrid")

# Create the scatter plot with regression line
plt.figure(figsize=(10, 6))
sns.regplot(data=full_relation, x='gpu_specs_avg_g3d_mark', y='laptop_specs_price', color='blue', scatter_kws={'s': 10})

# Add titles and labels
plt.title("Correlation between Avg G3D Mark and Price", fontsize=16)
plt.xlabel("Avg G3D Mark", fontsize=14)
plt.ylabel("Price", fontsize=14)
plt.grid(True)

# Show the plot
plt.tight_layout()
plt.show()
No description has been provided for this image

Random Access Memory (RAM)¶

Basic analysis¶

In [48]:
# Print unique values and their counts for RAM amount
ram_amount_counts = full_relation['laptop_specs_ram_amount'].value_counts()
print("Unique RAM amounts and their counts:")
print(ram_amount_counts)

# Print unique values and their counts for RAM type
ram_type_counts = full_relation['laptop_specs_ram_type'].value_counts()
print("\nUnique RAM types and their counts:")
print(ram_type_counts)
Unique RAM amounts and their counts:
laptop_specs_ram_amount
16.0     1458
8.0       751
32.0      269
24.0       53
12.0       43
4.0        34
64.0       22
36.0       22
512.0      15
48.0       10
18.0        8
96.0        4
128.0       3
40.0        1
Name: count, dtype: int64

Unique RAM types and their counts:
laptop_specs_ram_type
ddr5    1374
ddr4    1007
Name: count, dtype: int64
In [49]:
# Convert RAM amount to categorical type
full_relation['laptop_specs_ram_amount'] = pd.Categorical(full_relation['laptop_specs_ram_amount'])

# Plot the unique values and their counts horizontally
plt.figure(figsize=(12, 8))
ax = sns.barplot(x=ram_amount_counts.index.astype(int).astype(str), y=ram_amount_counts.values, palette="viridis")
plt.title("Number of Laptops by RAM Amount", fontsize=16)
plt.xlabel("Count", fontsize=14)
plt.ylabel("RAM Amount (GB)", fontsize=14)
plt.grid(axis='x', linestyle='--', alpha=0.7)
plt.tight_layout()

# Add data labels
for container in ax.containers:
    ax.bar_label(container, fmt='%d')

# Show the plot
plt.show()
/tmp/ipykernel_538/3967087379.py:6: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  ax = sns.barplot(x=ram_amount_counts.index.astype(int).astype(str), y=ram_amount_counts.values, palette="viridis")
No description has been provided for this image
In [50]:
# Plot the pie chart for RAM types
plt.figure(figsize=(8, 8))
ram_type_counts.plot(kind='pie', autopct='%1.1f%%', startangle=140, colors=['#66b3ff','#99ff99'], labels=ram_type_counts.index, wedgeprops=dict(width=0.3))

# Add title
plt.title("Distribution of RAM Types", fontsize=16)

# Show the plot
plt.show()
No description has been provided for this image

Analyzing RAM performance relation with price¶

In [51]:
# Calculate the correlation between RAM amount and price
correlation_ram_price = full_relation['laptop_specs_ram_amount'].astype(float).corr(full_relation['laptop_specs_price'])

# Print the correlation result
print(f"Correlation between RAM amount and price: {correlation_ram_price:.2f}")
Correlation between RAM amount and price: 0.26
In [52]:
# Set the plot style
sns.set_theme(style="whitegrid")

# Create a boxplot for price distribution by RAM amount
plt.figure(figsize=(14, 8))
sns.boxplot(data=full_relation, x='laptop_specs_ram_amount', y='laptop_specs_price', palette="viridis")

# Add titles and labels
plt.title("Price Distribution by RAM Amount", fontsize=16)
plt.xlabel("RAM Amount (GB)", fontsize=14)
plt.ylabel("Price", fontsize=14)
plt.grid(True)

# Show the plot
plt.tight_layout()
plt.show()
/tmp/ipykernel_538/1837874617.py:6: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(data=full_relation, x='laptop_specs_ram_amount', y='laptop_specs_price', palette="viridis")
No description has been provided for this image
In [53]:
# Set the plot style
sns.set_theme(style="whitegrid")

# Create a KDE plot for price distribution by RAM type
plt.figure(figsize=(14, 8))
sns.kdeplot(data=full_relation, x='laptop_specs_price', hue='laptop_specs_ram_type', fill=True, palette="viridis")

# Add titles and labels
plt.title("Price Distribution by RAM Type", fontsize=16)
plt.xlabel("Price", fontsize=14)
plt.ylabel("Density", fontsize=14)
plt.grid(True)

# Show the plot
plt.tight_layout()
plt.show()
No description has been provided for this image

Storage¶

Basic analysis¶

In [54]:
# Clone the full_relation DataFrame
full_relation_clone = full_relation.copy()
# Convert 'laptop_specs_storage_amount' to numeric type
full_relation_clone['laptop_specs_storage_amount'] = pd.to_numeric(full_relation_clone['laptop_specs_storage_amount'], errors='coerce')

# Filter the DataFrame
full_relation_clone = full_relation_clone[full_relation_clone['laptop_specs_storage_amount'] >= 128]

# Print unique values and their counts for storage amount
storage_amount_counts = full_relation_clone['laptop_specs_storage_amount'].value_counts()
print("Unique storage amounts and their counts:")
print(storage_amount_counts)

# Print unique values and their counts for storage type
storage_type_counts = full_relation_clone['laptop_specs_storage_type'].value_counts()
print("\nUnique storage types and their counts:")
print(storage_type_counts)
Unique storage amounts and their counts:
laptop_specs_storage_amount
512.0     1653
1024.0     466
256.0      183
2048.0      53
8192.0       3
4096.0       3
Name: count, dtype: int64

Unique storage types and their counts:
laptop_specs_storage_type
ssd    2299
hdd       4
Name: count, dtype: int64
In [55]:
# Convert storage amount to categorical type
full_relation_clone['laptop_specs_storage_amount'] = pd.Categorical(full_relation_clone['laptop_specs_storage_amount'])

# Plot the unique values and their counts horizontally
storage_amount_counts = full_relation_clone['laptop_specs_storage_amount'].value_counts()
plt.figure(figsize=(12, 8))
ax = sns.barplot(x=storage_amount_counts.index.astype(int).astype(str), y=storage_amount_counts.values, palette="viridis")
plt.title("Number of Laptops by Storage Amount", fontsize=16)
plt.xlabel("Storage Amount (GB)", fontsize=14)
plt.ylabel("Count", fontsize=14)
plt.grid(axis='x', linestyle='--', alpha=0.7)
plt.tight_layout()

# Add data labels
for container in ax.containers:
    ax.bar_label(container, fmt='%d')

# Show the plot
plt.show()
/tmp/ipykernel_538/4095148162.py:7: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  ax = sns.barplot(x=storage_amount_counts.index.astype(int).astype(str), y=storage_amount_counts.values, palette="viridis")
No description has been provided for this image

Analyzing Storage relation with price¶

In [56]:
# Calculate the correlation between storage amount and price
correlation_storage_price = full_relation_clone['laptop_specs_storage_amount'].astype(float).corr(full_relation_clone['laptop_specs_price'])

# Print the correlation result
print(f"Correlation between storage amount and price: {correlation_storage_price:.2f}")
Correlation between storage amount and price: 0.66
In [57]:
# Set the plot style
sns.set_theme(style="whitegrid")

# Create a boxplot for price distribution by storage amount
plt.figure(figsize=(14, 8))
sns.boxplot(data=full_relation_clone, x='laptop_specs_storage_amount', y='laptop_specs_price', palette="viridis")

# Add titles and labels
plt.title("Price Distribution by Storage Amount", fontsize=16)
plt.xlabel("Storage Amount (GB)", fontsize=14)
plt.ylabel("Price", fontsize=14)
plt.grid(True)

# Show the plot
plt.tight_layout()
plt.show()
/tmp/ipykernel_538/4036590408.py:6: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(data=full_relation_clone, x='laptop_specs_storage_amount', y='laptop_specs_price', palette="viridis")
No description has been provided for this image

Screen Features¶

Basic analysis¶

In [58]:
# Calculate summary statistics for screen size, refresh rate, and brightness
screen_size_stats = full_relation['laptop_specs_screen_size'].describe()
refresh_rate_stats = full_relation['laptop_specs_screen_refresh_rate'].describe()
brightness_stats = full_relation['laptop_specs_screen_brightness'].describe()

# Print the results
print("Summary Statistics for Screen Size:")
print(screen_size_stats)

print("\nSummary Statistics for Screen Refresh Rate:")
print(refresh_rate_stats)

print("\nSummary Statistics for Screen Brightness:")
print(brightness_stats)
Summary Statistics for Screen Size:
count    2677.000000
mean       15.067762
std         0.990302
min        11.600000
25%        14.000000
50%        15.600000
75%        15.600000
max        18.000000
Name: laptop_specs_screen_size, dtype: float64

Summary Statistics for Screen Refresh Rate:
count    1630.000000
mean      126.676687
std        51.595981
min        60.000000
25%        60.000000
50%       120.000000
75%       144.000000
max       480.000000
Name: laptop_specs_screen_refresh_rate, dtype: float64

Summary Statistics for Screen Brightness:
count    1502.000000
mean      350.348202
std       132.497193
min       220.000000
25%       250.000000
50%       300.000000
75%       400.000000
max      1200.000000
Name: laptop_specs_screen_brightness, dtype: float64
In [59]:
# Print unique values and their counts for screen resolution
screen_resolution_counts = full_relation['laptop_specs_screen_resolution'].value_counts()
print("Unique screen resolutions and their counts:")
print(screen_resolution_counts)

# Plot the unique values and their counts
plt.figure(figsize=(12, 8))
ax = sns.barplot(y=screen_resolution_counts.index, x=screen_resolution_counts.values, palette="viridis")
plt.title("Number of Laptops by Screen Resolution", fontsize=16)
plt.xlabel("Count", fontsize=14)
plt.ylabel("Screen Resolution", fontsize=14)
plt.grid(axis='x', linestyle='--', alpha=0.7)
plt.tight_layout()

# Add data labels
for container in ax.containers:
    ax.bar_label(container, fmt='%d')

# Show the plot
plt.show()
/tmp/ipykernel_538/3570925166.py:8: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  ax = sns.barplot(y=screen_resolution_counts.index, x=screen_resolution_counts.values, palette="viridis")
Unique screen resolutions and their counts:
laptop_specs_screen_resolution
1920x1080    1330
1920x1200     395
2560x1600     274
2880x1800     196
3024x1964      86
2560x1664      71
3456x2234      43
2880x1864      31
3840x2400      26
3200x2000      23
2880x1620      20
2560x1440      19
2880x1920      18
3072x1920      13
2560x1644      10
2240x1400      10
2048x1280       9
1366x768        9
3456x2160       4
2800x1800       3
1980x1080       2
3000x2000       2
2800x1620       1
1920x1280       1
2960x1848       1
2160x1440       1
3201x2000       1
2220x1080       1
2256x1504       1
1900x1200       1
Name: count, dtype: int64
No description has been provided for this image

Analysis of screen features with price¶

In [60]:
# Clone the full_relation DataFrame
full_relation_clone = full_relation.copy()

# Get the counts of each screen resolution
screen_resolution_counts = full_relation_clone['laptop_specs_screen_resolution'].value_counts()

# Filter out screen resolutions with count < 20
filtered_screen_resolutions = screen_resolution_counts[screen_resolution_counts >= 20].index

# Filter the DataFrame
full_relation_clone = full_relation_clone[full_relation_clone['laptop_specs_screen_resolution'].isin(filtered_screen_resolutions)]

# Set the plot style
sns.set_theme(style="whitegrid")

# Create a boxplot for price distribution by screen resolution
# Sort the DataFrame by screen resolution
full_relation_clone = full_relation_clone.sort_values(by='laptop_specs_screen_resolution')

plt.figure(figsize=(14, 8))
sns.boxplot(data=full_relation_clone, y='laptop_specs_screen_resolution', x='laptop_specs_price', palette="viridis")

# Add titles and labels
plt.title("Price Distribution by Screen Resolution", fontsize=16)
plt.ylabel("Screen Resolution", fontsize=14)
plt.xlabel("Price", fontsize=14)
plt.grid(True)

# Show the plot
plt.tight_layout()
plt.show()
/tmp/ipykernel_538/1749291397.py:21: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(data=full_relation_clone, y='laptop_specs_screen_resolution', x='laptop_specs_price', palette="viridis")
No description has been provided for this image
In [61]:
# Print the correlation
correlation_screen_size_price = full_relation['laptop_specs_screen_size'].corr(full_relation['laptop_specs_price'])
correlation_refresh_rate_price = full_relation['laptop_specs_screen_refresh_rate'].corr(full_relation['laptop_specs_price'])
correlation_brightness_price = full_relation['laptop_specs_screen_brightness'].corr(full_relation['laptop_specs_price'])

print(f"Correlation between screen size and price: {correlation_screen_size_price:.2f}")
print(f"Correlation between screen refresh rate and price: {correlation_refresh_rate_price:.2f}")
print(f"Correlation between screen brightness and price: {correlation_brightness_price:.2f}")

# Set the plot style
sns.set_theme(style="whitegrid")

# Create subplots
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# Plot price vs. screen size
sns.regplot(data=full_relation, x='laptop_specs_screen_size', y='laptop_specs_price', ax=axes[0], color='blue', scatter_kws={'alpha':0.7})
axes[0].set_title('Price vs. Screen Size')
axes[0].set_xlabel('Screen Size (inches)')
axes[0].set_ylabel('Price (VND)')

# Plot price vs. screen refresh rate
sns.regplot(data=full_relation, x='laptop_specs_screen_refresh_rate', y='laptop_specs_price', ax=axes[1], color='green', scatter_kws={'alpha':0.7})
axes[1].set_title('Price vs. Screen Refresh Rate')
axes[1].set_xlabel('Screen Refresh Rate (Hz)')
axes[1].set_ylabel('Price (VND)')

# Plot price vs. screen brightness
sns.regplot(data=full_relation, x='laptop_specs_screen_brightness', y='laptop_specs_price', ax=axes[2], color='red', scatter_kws={'alpha':0.7})
axes[2].set_title('Price vs. Screen Brightness')
axes[2].set_xlabel('Screen Brightness (nits)')
axes[2].set_ylabel('Price (VND)')

plt.tight_layout()
plt.show()
Correlation between screen size and price: 0.19
Correlation between screen refresh rate and price: 0.36
Correlation between screen brightness and price: 0.54
No description has been provided for this image

Portability Features¶

Weight¶

Basic analysis

In [62]:
# Print summary statistics for weight
weight_stats = full_relation['laptop_specs_weight'].describe()
print("Summary Statistics for Weight:")
print(weight_stats)
Summary Statistics for Weight:
count    2525.000000
mean        1.825792
std         0.460089
min         0.879000
25%         1.460000
50%         1.700000
75%         2.200000
max         4.000000
Name: laptop_specs_weight, dtype: float64
In [63]:
# Plot the distribution of laptop weights
plt.figure(figsize=(10, 6))
sns.histplot(full_relation['laptop_specs_weight'].dropna(), kde=True, color='blue', bins=30)

# Add labels and title
plt.title("Distribution of Laptop Weights", fontsize=16)
plt.xlabel("Weight (kg)", fontsize=14)
plt.ylabel("Frequency", fontsize=14)

# Show the plot
plt.show()
No description has been provided for this image

Analysis of weight with price

In [64]:
# Calculate the correlation between weight and price
correlation_weight_price = full_relation['laptop_specs_weight'].corr(full_relation['laptop_specs_price'])

# Print the correlation result
print(f"Correlation between weight and price: {correlation_weight_price:.2f}")
Correlation between weight and price: 0.28
In [65]:
# Set the plot style
sns.set_theme(style="whitegrid")

# Create the scatter plot with regression line
plt.figure(figsize=(10, 6))
sns.regplot(data=full_relation, x='laptop_specs_weight', y='laptop_specs_price', color='blue', scatter_kws={'s': 10})

# Add titles and labels
plt.title("Weight vs Price", fontsize=16)
plt.xlabel("Weight (kg)", fontsize=14)
plt.ylabel("Price (VND)", fontsize=14)
plt.grid(True)

# Show the plot
plt.tight_layout()
plt.show()
No description has been provided for this image

Length, Width, Height¶

Basic analysis

In [66]:
# Calculate summary statistics for length, width, and height
length_stats = full_relation['laptop_specs_height'].describe()
width_stats = full_relation['laptop_specs_width'].describe()
height_stats = full_relation['laptop_specs_depth'].describe()

# Print the results
print("Summary Statistics for Length:")
print(length_stats)

print("\nSummary Statistics for Width:")
print(width_stats)

print("\nSummary Statistics for Height:")
print(height_stats)
Summary Statistics for Length:
count    1998.000000
mean        2.000681
std         0.925657
min         0.670000
25%         1.690000
50%         1.900000
75%         2.230000
max        22.700000
Name: laptop_specs_height, dtype: float64

Summary Statistics for Width:
count    1998.000000
mean       34.128338
std         2.453755
min        22.810000
25%        31.440000
50%        35.570000
75%        35.940000
max        52.300000
Name: laptop_specs_width, dtype: float64

Summary Statistics for Height:
count    1998.000000
mean       23.661036
std         1.934277
min         3.000000
25%        22.120000
50%        23.560000
75%        25.100000
max        32.000000
Name: laptop_specs_depth, dtype: float64
In [67]:
# Set the plot style
sns.set_theme(style="whitegrid")

# Create subplots
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# Plot the distribution of length
sns.histplot(full_relation['laptop_specs_height'].dropna(), kde=True, color='blue', bins=30, ax=axes[0])
axes[0].set_title("Distribution of Laptop Length", fontsize=16)
axes[0].set_xlabel("Length (cm)", fontsize=14)
axes[0].set_ylabel("Frequency", fontsize=14)

# Plot the distribution of width
sns.histplot(full_relation['laptop_specs_width'].dropna(), kde=True, color='green', bins=30, ax=axes[1])
axes[1].set_title("Distribution of Laptop Width", fontsize=16)
axes[1].set_xlabel("Width (cm)", fontsize=14)
axes[1].set_ylabel("Frequency", fontsize=14)

# Plot the distribution of height
sns.histplot(full_relation['laptop_specs_depth'].dropna(), kde=True, color='red', bins=30, ax=axes[2])
axes[2].set_title("Distribution of Laptop Height", fontsize=16)
axes[2].set_xlabel("Height (cm)", fontsize=14)
axes[2].set_ylabel("Frequency", fontsize=14)

plt.tight_layout()
plt.show()
No description has been provided for this image

Analysis of dimensions with price

In [68]:
# Calculate the correlation between length, width, height, and price
correlation_length_price = full_relation['laptop_specs_height'].corr(full_relation['laptop_specs_price'])
correlation_width_price = full_relation['laptop_specs_width'].corr(full_relation['laptop_specs_price'])
correlation_height_price = full_relation['laptop_specs_depth'].corr(full_relation['laptop_specs_price'])

# Print the correlation results
print(f"Correlation between length and price: {correlation_length_price:.2f}")
print(f"Correlation between width and price: {correlation_width_price:.2f}")
print(f"Correlation between height and price: {correlation_height_price:.2f}")
Correlation between length and price: 0.09
Correlation between width and price: -0.02
Correlation between height and price: 0.21
In [69]:
# Set the plot style
sns.set_theme(style="whitegrid")

# Create subplots
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# Plot length vs price with regression line
sns.regplot(data=full_relation, x='laptop_specs_height', y='laptop_specs_price', ax=axes[0], color='blue', scatter_kws={'s': 10})
axes[0].set_title('Length vs Price')
axes[0].set_xlabel('Length (cm)')
axes[0].set_ylabel('Price (VND)')

# Plot width vs price with regression line
sns.regplot(data=full_relation, x='laptop_specs_width', y='laptop_specs_price', ax=axes[1], color='green', scatter_kws={'s': 10})
axes[1].set_title('Width vs Price')
axes[1].set_xlabel('Width (cm)')
axes[1].set_ylabel('Price (VND)')

# Plot height vs price with regression line
sns.regplot(data=full_relation, x='laptop_specs_depth', y='laptop_specs_price', ax=axes[2], color='red', scatter_kws={'s': 10})
axes[2].set_title('Height vs Price')
axes[2].set_xlabel('Height (cm)')
axes[2].set_ylabel('Price (VND)')

plt.tight_layout()
plt.show()
No description has been provided for this image
In [70]:
# Calculate the product of length, width, and height
full_relation_clone['volume'] = full_relation_clone['laptop_specs_height'] * full_relation_clone['laptop_specs_width'] * full_relation_clone['laptop_specs_depth']

# Calculate the correlation between volume and price
correlation_volume_price = full_relation_clone['volume'].corr(full_relation_clone['laptop_specs_price'])

# Print the correlation result
print(f"Correlation between volume and price: {correlation_volume_price:.2f}")

# Plot the correlation between volume and price
plt.figure(figsize=(10, 6))
sns.regplot(data=full_relation_clone, x='volume', y='laptop_specs_price', color='blue', scatter_kws={'s': 10})

# Add titles and labels
plt.title("Volume vs Price", fontsize=16)
plt.xlabel("Volume (cm³)", fontsize=14)
plt.ylabel("Price (VND)", fontsize=14)
plt.grid(True)

# Show the plot
plt.tight_layout()
plt.show()
Correlation between volume and price: 0.13
No description has been provided for this image

Battery and Power¶

Basic Analysis¶

In [71]:
# Calculate summary statistics for battery amount and battery cells
battery_amount_stats = full_relation['laptop_specs_battery_capacity'].describe()
battery_cells_stats = full_relation['laptop_specs_battery_cells'].describe()

# Print the results
print("Summary Statistics for Battery Capacity:")
print(battery_amount_stats)

print("\nSummary Statistics for Battery Cells:")
print(battery_cells_stats)
Summary Statistics for Battery Capacity:
count    2372.000000
mean       60.717487
std        16.856594
min        30.000000
25%        50.000000
50%        57.000000
75%        71.000000
max       120.000000
Name: laptop_specs_battery_capacity, dtype: float64

Summary Statistics for Battery Cells:
count    1747.000000
mean        3.514596
std         0.653819
min         2.000000
25%         3.000000
50%         3.000000
75%         4.000000
max         6.000000
Name: laptop_specs_battery_cells, dtype: float64
In [72]:
# Plot the distribution of battery capacity
plt.figure(figsize=(10, 6))
sns.histplot(full_relation['laptop_specs_battery_capacity'].dropna(), kde=True, color='blue', bins=30)

# Add labels and title
plt.title("Distribution of Laptop Battery Capacity", fontsize=16)
plt.xlabel("Battery Capacity (Wh)", fontsize=14)
plt.ylabel("Frequency", fontsize=14)

# Show the plot
plt.show()
No description has been provided for this image

Analysis of battery and power features with price

In [73]:
# Calculate the correlation between battery capacity and price
correlation_battery_capacity_price = full_relation['laptop_specs_battery_capacity'].corr(full_relation['laptop_specs_price'])

# Calculate the correlation between battery cells and price
correlation_battery_cells_price = full_relation['laptop_specs_battery_cells'].corr(full_relation['laptop_specs_price'])

# Print the correlation results
print(f"Correlation between battery capacity and price: {correlation_battery_capacity_price:.2f}")
print(f"Correlation between battery cells and price: {correlation_battery_cells_price:.2f}")
Correlation between battery capacity and price: 0.66
Correlation between battery cells and price: 0.49
In [74]:
# Set the plot style
sns.set_theme(style="whitegrid")

# Create the scatter plot with regression line
plt.figure(figsize=(10, 6))
sns.regplot(data=full_relation, x='laptop_specs_battery_capacity', y='laptop_specs_price', color='blue', scatter_kws={'s': 10})

# Add titles and labels
plt.title("Battery Capacity vs Price", fontsize=16)
plt.xlabel("Battery Capacity (Wh)", fontsize=14)
plt.ylabel("Price (VND)", fontsize=14)
plt.grid(True)

# Show the plot
plt.tight_layout()
plt.show()
No description has been provided for this image

Connectivity Features¶

Basic analysis¶

In [97]:
# Print unique values and their counts for number of USB-A ports
usb_a_counts = full_relation['laptop_specs_number_usb_a_ports'].value_counts()
print("Unique values and counts for number of USB-A ports:")
print(usb_a_counts)

# Print unique values and their counts for number of USB-C ports
usb_c_counts = full_relation['laptop_specs_number_usb_c_ports'].value_counts()
print("\nUnique values and counts for number of USB-C ports:")
print(usb_c_counts)

# Print unique values and their counts for number of HDMI ports
hdmi_counts = full_relation['laptop_specs_number_hdmi_ports'].value_counts()
print("\nUnique values and counts for number of HDMI ports:")
print(hdmi_counts)

# Print unique values and their counts for number of Ethernet ports
ethernet_counts = full_relation['laptop_specs_number_ethernet_ports'].value_counts()
print("\nUnique values and counts for number of Ethernet ports:")
print(ethernet_counts)

# Print unique values and their counts for number of audio jacks
audio_jack_counts = full_relation['laptop_specs_number_audio_jacks'].value_counts()
print("\nUnique values and counts for number of audio jacks:")
print(audio_jack_counts)
Unique values and counts for number of USB-A ports:
laptop_specs_number_usb_a_ports
0.0    1309
2.0     570
3.0     367
1.0     192
4.0      21
6.0       5
5.0       2
Name: count, dtype: int64

Unique values and counts for number of USB-C ports:
laptop_specs_number_usb_c_ports
1.0    1125
2.0     697
0.0     514
3.0     105
4.0      25
Name: count, dtype: int64

Unique values and counts for number of HDMI ports:
laptop_specs_number_hdmi_ports
1.0    2121
0.0     345
Name: count, dtype: int64

Unique values and counts for number of Ethernet ports:
laptop_specs_number_ethernet_ports
0.0    2071
1.0     395
Name: count, dtype: int64

Unique values and counts for number of audio jacks:
laptop_specs_number_audio_jacks
0.0    1508
1.0     958
Name: count, dtype: int64
In [101]:
import matplotlib.pyplot as plt
import seaborn as sns

# Set up the figure and axes
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
axes = axes.flatten()

# Plot the pie chart for number of USB-A ports
usb_a_counts.plot(
    kind='pie',
    autopct='%1.1f%%',
    startangle=140,
    ax=axes[0],
    colors=sns.color_palette('pastel', len(usb_a_counts)),
    labels=None  # Remove labels
)
axes[0].set_title("Distribution of USB-A Ports")
axes[0].set_ylabel('')
axes[0].legend(usb_a_counts.index, title="USB-A Ports", loc="best")

# Plot the pie chart for number of USB-C ports
usb_c_counts.plot(
    kind='pie',
    autopct='%1.1f%%',
    startangle=140,
    ax=axes[1],
    colors=sns.color_palette('pastel', len(usb_c_counts)),
    labels=None  # Remove labels
)
axes[1].set_title("Distribution of USB-C Ports")
axes[1].set_ylabel('')
axes[1].legend(usb_c_counts.index, title="USB-C Ports", loc="best")

# Plot the pie chart for number of HDMI ports
hdmi_counts.plot(
    kind='pie',
    autopct='%1.1f%%',
    startangle=140,
    ax=axes[2],
    colors=sns.color_palette('pastel', len(hdmi_counts)),
    labels=None  # Remove labels
)
axes[2].set_title("Distribution of HDMI Ports")
axes[2].set_ylabel('')
axes[2].legend(hdmi_counts.index, title="HDMI Ports", loc="best")

# Plot the pie chart for number of Ethernet ports
ethernet_counts.plot(
    kind='pie',
    autopct='%1.1f%%',
    startangle=140,
    ax=axes[3],
    colors=sns.color_palette('pastel', len(ethernet_counts)),
    labels=None  # Remove labels
)
axes[3].set_title("Distribution of Ethernet Ports")
axes[3].set_ylabel('')
axes[3].legend(ethernet_counts.index, title="Ethernet Ports", loc="best")

# Plot the pie chart for number of Audio Jacks
audio_jack_counts.plot(
    kind='pie',
    autopct='%1.1f%%',
    startangle=140,
    ax=axes[4],
    colors=sns.color_palette('pastel', len(audio_jack_counts)),
    labels=None  # Remove labels
)
axes[4].set_title("Distribution of Audio Jacks")
axes[4].set_ylabel('')
axes[4].legend(audio_jack_counts.index, title="Audio Jacks", loc="best")

# Remove the last empty subplot
fig.delaxes(axes[5])

# Adjust layout
plt.tight_layout()
plt.show()
No description has been provided for this image

Analysis connectivity to price¶

In [103]:
# Calculate the correlation between connectivity features and price
correlation_usb_a_price = full_relation['laptop_specs_number_usb_a_ports'].corr(full_relation['laptop_specs_price'])
correlation_usb_c_price = full_relation['laptop_specs_number_usb_c_ports'].corr(full_relation['laptop_specs_price'])
correlation_hdmi_price = full_relation['laptop_specs_number_hdmi_ports'].corr(full_relation['laptop_specs_price'])
correlation_ethernet_price = full_relation['laptop_specs_number_ethernet_ports'].corr(full_relation['laptop_specs_price'])
correlation_audio_jack_price = full_relation['laptop_specs_number_audio_jacks'].corr(full_relation['laptop_specs_price'])

# Print the correlation results
print(f"Correlation between number of USB-A ports and price: {correlation_usb_a_price:.2f}")
print(f"Correlation between number of USB-C ports and price: {correlation_usb_c_price:.2f}")
print(f"Correlation between number of HDMI ports and price: {correlation_hdmi_price:.2f}")
print(f"Correlation between number of Ethernet ports and price: {correlation_ethernet_price:.2f}")
print(f"Correlation between number of audio jacks and price: {correlation_audio_jack_price:.2f}")
Correlation between number of USB-A ports and price: -0.10
Correlation between number of USB-C ports and price: 0.20
Correlation between number of HDMI ports and price: -0.17
Correlation between number of Ethernet ports and price: -0.07
Correlation between number of audio jacks and price: -0.02

Software Features¶

Default OS¶

Basic analysis

In [81]:
# Print unique values and their counts for default OS
os_counts = full_relation['laptop_specs_default_os'].value_counts()

# Replace 'window' with 'windows' in the 'laptop_specs_default_os' column
full_relation['laptop_specs_default_os'] = full_relation['laptop_specs_default_os'].apply(lambda x: 'windows' if x is not None and 'window' in x.lower() else x)

# Print the updated unique OS and their counts
os_counts = full_relation['laptop_specs_default_os'].value_counts()
print("Unique OS and their counts:")
print(os_counts)
Unique OS and their counts:
laptop_specs_default_os
windows      2283
macos         296
linux          26
chrome os       2
Name: count, dtype: int64
In [90]:
# Plot the pie chart for default OS
plt.figure(figsize=(8, 8))
os_counts.plot(
    kind='pie',
    autopct='%1.1f%%',
    startangle=140,
    colors=['#66b3ff', '#99ff99', '#ffcc99', '#ff9999'],
    labels=None,  # Remove labels from the pie chart
    wedgeprops=dict(width=0.3),
    textprops={'fontsize': 10}  # Adjust text size
)

# Add a legend for categories
plt.legend(os_counts.index, loc="best")

# Add title
plt.title("Distribution of Default OS", fontsize=16)

# Show the plot
plt.show()
No description has been provided for this image

Warranty¶

In [91]:
# Print unique values and their counts for warranty
warranty_counts = full_relation['laptop_specs_warranty'].value_counts()
print("Unique warranty values and their counts:")
print(warranty_counts)
Unique warranty values and their counts:
laptop_specs_warranty
12.0    1288
24.0     920
36.0     105
18.0       2
Name: count, dtype: int64
In [93]:
# Print correlation
correlation_warranty_price = full_relation['laptop_specs_warranty'].corr(full_relation['laptop_specs_price'])
print(f"Correlation between warranty and price: {correlation_warranty_price:.2f}")

# Set the plot style
sns.set_theme(style="whitegrid")

# Create a boxplot for price distribution by warranty
plt.figure(figsize=(14, 8))
sns.boxplot(data=full_relation, x='laptop_specs_warranty', y='laptop_specs_price', palette="viridis")

# Add titles and labels
plt.title("Price Distribution by Warranty", fontsize=16)
plt.xlabel("Warranty (months)", fontsize=14)
plt.ylabel("Price (VND)", fontsize=14)
plt.grid(True)

# Show the plot
plt.tight_layout()
plt.show()
Correlation between warranty and price: 0.08
/tmp/ipykernel_538/2459959301.py:10: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(data=full_relation, x='laptop_specs_warranty', y='laptop_specs_price', palette="viridis")
No description has been provided for this image

Target Feature: price¶

Basic statistics

In [ ]:
# Calculate basic statistics for the price column
price_stats = full_relation['laptop_specs_price'].describe()

# Print the statistics
print("Basic Statistics for Price:")
print(price_stats)
Basic Statistics for Price:
count    2.702000e+03
mean     2.942449e+07
std      1.869344e+07
min      3.590000e+06
25%      1.799000e+07
50%      2.399000e+07
75%      3.464000e+07
max      1.824900e+08
Name: laptop_specs_price, dtype: float64

Visualizing the distribution

In [ ]:
# Plot the distribution of laptop prices
plt.figure(figsize=(12, 6))
sns.histplot(full_relation['laptop_specs_price'], kde=True, color='blue', bins=30)

# Add labels and title
plt.title("Distribution of Laptop Prices", fontsize=16)
plt.xlabel("Price (VND)", fontsize=14)
plt.ylabel("Frequency", fontsize=14)

# Show the plot
plt.show()
No description has been provided for this image
In [ ]:
plt.figure(figsize=(12, 6))
sns.boxplot(data=full_relation, x='laptop_specs_price', palette="viridis")

# Add titles and labels
plt.title("Boxplot of Laptop Prices", fontsize=16)
plt.xlabel("Price (VND)", fontsize=14)
plt.grid(True)

# Show the plot
plt.tight_layout()
plt.show()
/tmp/ipykernel_18917/2849463995.py:2: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(data=full_relation, x='laptop_specs_price', palette="viridis")
No description has been provided for this image